# R for Data Science Chapter 5
# Exploratory Data Analysis
# Daniel J. Vera, Ph.D.
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(nycflights13)
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut))

diamonds %>%
count(cut)
## # A tibble: 5 × 2
## cut n
## <ord> <int>
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
ggplot(data = diamonds) +
geom_histogram(mapping = aes(x = carat), binwidth = 0.5)

diamonds %>%
count(cut_width(carat, 0.5))
## # A tibble: 11 × 2
## `cut_width(carat, 0.5)` n
## <fctr> <int>
## 1 [-0.25,0.25] 785
## 2 (0.25,0.75] 29498
## 3 (0.75,1.25] 15977
## 4 (1.25,1.75] 5313
## 5 (1.75,2.25] 2002
## 6 (2.25,2.75] 322
## 7 (2.75,3.25] 32
## 8 (3.25,3.75] 5
## 9 (3.75,4.25] 4
## 10 (4.25,4.75] 1
## 11 (4.75,5.25] 1
smaller_diamonds <- diamonds %>%
filter(carat < 3)
ggplot(data = smaller_diamonds, mapping = aes(x = carat)) +
geom_histogram(binwidth = 0.1)

ggplot(data = smaller_diamonds, mapping = aes(x = carat)) +
geom_histogram(binwidth = 0.01)

ggplot(data = smaller_diamonds, mapping = aes(x = carat, color = cut)) +
geom_freqpoly(binwidth = 0.1)

ggplot(data = faithful, mapping = aes(x = eruptions)) +
geom_histogram(binwdith = 0.25)
## Warning: Ignoring unknown parameters: binwdith
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds) +
geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
coord_cartesian(ylim = c(0, 50))

unusual <- diamonds %>%
# trying to pick up on unusual values in previous graph
filter(y < 3 | y > 20) %>%
arrange(y)
# Exercises 7.3.4 on website
#===============================================================
# Explore the distribution of each of the x, y, and z variables
# in diamonds. What do you learn? Think about a diamond and how
# you might decide which dimension is the length, width, and depth.
ggplot(data = diamonds) +
geom_histogram(mapping = aes(x = x), binwidth = 0.5)

ggplot(data = diamonds) +
geom_histogram(mapping = aes(x = y), binwidth = 0.5)

ggplot(data = diamonds) +
geom_histogram(mapping = aes(x = z), binwidth = 0.5)

# Explore the distribution of price. Do you discover anything unusual
# or surprising?
# (Hint: Carefully think about the binwidth and make sure you try a
# wide range of values.)
ggplot(data = diamonds, mapping = aes(x = price)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds, mapping = aes(x = price)) +
geom_histogram(binwidth = 100)

ggplot(data = diamonds, mapping = aes(x = price)) +
geom_histogram(binwidth = 100) +
coord_cartesian(xlim = c(0, 5000))

diamonds %>%
filter(price < 1500) %>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 20010
diamonds %>%
filter(price > 1500) %>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 33930
# How many diamonds are 0.99 carat? How many are 1 carat?
# What do you think is the cause of the difference?
ggplot(data = diamonds, mapping = aes(x = carat)) +
geom_histogram(binwidth = .01) +
coord_cartesian(xlim = c(0.90, 1.1))

diamonds %>%
filter(carat == 1) %>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 1558
diamonds %>%
filter(carat == 0.99) %>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 23
# most likely the 0.99 are ones made with the intention of being
# 1 carat and there is some error that makes it 0.99 or there is
# a measurement error another reason from online solutions:
# Around 1500 diamonds are 1.001.00 carat, compared to around 30
# or so diamonds at .99.99 carat. This could occur because prospective
# buyers of diamonds, if they are already willing to buy a .99.99 carat
# diamond will decide it is more aesthetically pleasing to say they
# bought a 11 carat diamond so there is less demand for .99.99 carat
# diamonds.
# Compare and contrast coord_cartesian() vs xlim() or ylim() when
# zooming in on a histogram. What happens if you leave binwidth unset?
# What happens if you try and zoom so only half a bar shows?
ggplot(data = diamonds, mapping = aes(x = carat)) +
geom_histogram(binwidth = .01) +
coord_cartesian(xlim = c(0.90, 1.1))

ggplot(data = diamonds, mapping = aes(x = carat)) +
geom_histogram(binwidth = .01) +
xlim(0.90, 1.1)
## Warning: Removed 43609 rows containing non-finite values (stat_bin).

ggplot(data = diamonds, mapping = aes(x = carat)) +
geom_histogram() +
coord_cartesian(xlim = c(0.90, 1.1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds, mapping = aes(x = carat)) +
geom_histogram() +
xlim(0.90, 1.1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 43609 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).

#===============================================================
diamonds2 <- diamonds %>%
mutate(y = ifelse(y < 3 | y > 20, NA, y))
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
geom_point()
## Warning: Removed 9 rows containing missing values (geom_point).

ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
geom_point(na.rm = TRUE)

# Exercises 7.4.1 on website
#===============================================================
# What happens to missing values in a histogram? What happens to
# missing values in a bar chart? Why is there a difference?
ggplot(data = flights, mapping = aes(x = dep_delay)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8255 rows containing non-finite values (stat_bin).

flights %>%
mutate(carrier = ifelse(carrier == "AA", NA, carrier)) %>%
ggplot(aes(carrier)) +
geom_bar()

# Histograms omit missing values, bar charts draw them as another
# category
# What does na.rm = TRUE do in mean() and sum()?
# It removes the NA missing values before computing.
#===============================================================
ggplot(data = diamonds, mapping = aes(x = price)) +
geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

ggplot(data = diamonds, mapping = aes(x = cut)) +
geom_bar()

ggplot(
data = diamonds,
mapping = aes(x = price, y = ..density..)
) +
geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
geom_boxplot()

ggplot(data = mpg) +
geom_boxplot(
mapping = aes(
x = reorder(class, hwy, FUN = median),
y = hwy
)
) +
coord_flip()

# Exercises 7.5.1.1 on website
#===============================================================
# Use what you’ve learned to improve the visualisation of the
# departure times of cancelled vs. non-cancelled flights.
nycflights13::flights %>%
mutate(
cancelled = is.na(dep_time),
sched_hour = sched_dep_time %/% 100,
sched_min = sched_dep_time %% 100,
sched_dep_time = sched_hour + sched_min / 60
) %>%
ggplot(mapping = aes(sched_dep_time, y = ..density..)) +
geom_freqpoly(
mapping = aes(colour = cancelled),
size = 1.5,
binwidth = 1/4)

# What variable in the diamonds dataset is most important for predicting the
# price of a diamond? How is that variable correlated with cut? Why does the
# combination of those two relationships lead to lower quality diamonds
# being more expensive?
diamonds_numeric <- diamonds[-c(2, 3, 4)]
cor(diamonds_numeric)
## carat depth table price x y
## carat 1.00000000 0.02822431 0.1816175 0.9215913 0.97509423 0.95172220
## depth 0.02822431 1.00000000 -0.2957785 -0.0106474 -0.02528925 -0.02934067
## table 0.18161755 -0.29577852 1.0000000 0.1271339 0.19534428 0.18376015
## price 0.92159130 -0.01064740 0.1271339 1.0000000 0.88443516 0.86542090
## x 0.97509423 -0.02528925 0.1953443 0.8844352 1.00000000 0.97470148
## y 0.95172220 -0.02934067 0.1837601 0.8654209 0.97470148 1.00000000
## z 0.95338738 0.09492388 0.1509287 0.8612494 0.97077180 0.95200572
## z
## carat 0.95338738
## depth 0.09492388
## table 0.15092869
## price 0.86124944
## x 0.97077180
## y 0.95200572
## z 1.00000000
lm_diamonds = lm(price ~ ., data = diamonds)
summary(lm_diamonds)
##
## Call:
## lm(formula = price ~ ., data = diamonds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21376.0 -592.4 -183.5 376.4 10694.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5753.762 396.630 14.507 < 2e-16 ***
## carat 11256.978 48.628 231.494 < 2e-16 ***
## cut.L 584.457 22.478 26.001 < 2e-16 ***
## cut.Q -301.908 17.994 -16.778 < 2e-16 ***
## cut.C 148.035 15.483 9.561 < 2e-16 ***
## cut^4 -20.794 12.377 -1.680 0.09294 .
## color.L -1952.160 17.342 -112.570 < 2e-16 ***
## color.Q -672.054 15.777 -42.597 < 2e-16 ***
## color.C -165.283 14.725 -11.225 < 2e-16 ***
## color^4 38.195 13.527 2.824 0.00475 **
## color^5 -95.793 12.776 -7.498 6.59e-14 ***
## color^6 -48.466 11.614 -4.173 3.01e-05 ***
## clarity.L 4097.431 30.259 135.414 < 2e-16 ***
## clarity.Q -1925.004 28.227 -68.197 < 2e-16 ***
## clarity.C 982.205 24.152 40.668 < 2e-16 ***
## clarity^4 -364.918 19.285 -18.922 < 2e-16 ***
## clarity^5 233.563 15.752 14.828 < 2e-16 ***
## clarity^6 6.883 13.715 0.502 0.61575
## clarity^7 90.640 12.103 7.489 7.06e-14 ***
## depth -63.806 4.535 -14.071 < 2e-16 ***
## table -26.474 2.912 -9.092 < 2e-16 ***
## x -1008.261 32.898 -30.648 < 2e-16 ***
## y 9.609 19.333 0.497 0.61918
## z -50.119 33.486 -1.497 0.13448
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1130 on 53916 degrees of freedom
## Multiple R-squared: 0.9198, Adjusted R-squared: 0.9198
## F-statistic: 2.688e+04 on 23 and 53916 DF, p-value: < 2.2e-16
lm_diamonds_carat = lm(price ~ carat, data = diamonds)
summary(lm_diamonds_carat)
##
## Call:
## lm(formula = price ~ carat, data = diamonds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18585.3 -804.8 -18.9 537.4 12731.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2256.36 13.06 -172.8 <2e-16 ***
## carat 7756.43 14.07 551.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1549 on 53938 degrees of freedom
## Multiple R-squared: 0.8493, Adjusted R-squared: 0.8493
## F-statistic: 3.041e+05 on 1 and 53938 DF, p-value: < 2.2e-16
# carat looks like the most highly correlated.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = diamonds, mapping = aes(x = cut, y = carat)) +
geom_boxplot()

# it looks like on average, ideal cuts have lower carat and since
# carat is variable with highest correlation to price, ideal cuts
# will be cheaper, on average.
# Install the ggstance package, and create a horizontal boxplot.
# How does this compare to using coord_flip()?
ggplot(data = diamonds, mapping = aes(x = cut, y = carat)) +
geom_boxplot() +
coord_flip()

# install.packages("ggstance")
library(ggstance) # should go in beginning but see next comment
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
#bad form generally in scripts to install packages but for HW
ggplot(data = diamonds, mapping = aes(x = carat, y = cut)) +
geom_boxploth()

# To create a horizontal layer in ggplot2 with coord_flip(), you
# have to supply aesthetics as if they were to be drawn vertically
# then flip with coord_flip()
# In ggstance you do it as if drawn horizontally, see above.
# One problem with boxplots is that they were developed in an era of
# much smaller datasets and tend to display a prohibitively large number
# of “outlying values”. One approach to remedy this problem is the
# letter value plot.
# Install the lvplot package, and try using geom_lv() to display the
# distribution of price vs cut. What do you learn? How do you interpret
# the plots?
#install.packages("lvplot") # see above comment on installing packages
library(lvplot) # should at begining but see above
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
geom_boxplot()

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
geom_lv()

# Compare and contrast geom_violin() with a facetted geom_histogram(),
# or a coloured geom_freqpoly().
# What are the pros and cons of each method?
ggplot(data = diamonds) +
geom_histogram(mapping = aes(x = price)) +
facet_grid(cut ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds,
mapping = aes(x = price, y = ..density.., color = cut)) +
geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds) +
geom_violin(mapping = aes(x = cut, y = price)) +
coord_flip()

# If you have a small dataset, it’s sometimes useful to use geom_jitter()
# to see the relationship between a continuous and categorical variable.
# The ggbeeswarm package provides a number of methods similar to
# geom_jitter(). List them and briefly describe what each one does.
#install.packages("ggbeeswarm")
library(ggbeeswarm)
?ggbeeswarm
ggplot(data = mpg) +
geom_quasirandom(mapping = aes(x = reorder(class, hwy, FUN = median),
y = hwy))

ggplot(data = mpg) +
geom_beeswarm(mapping = aes(x = reorder(class, hwy, FUN = median),
y = hwy))

# The beeswarm geom is a convenient means to offset points within categories
# to reduce overplotting. Uses the beeswarm package
# The quasirandom geom is a convenient means to offset points within
# categories to reduce overplotting. Uses the vipor package.
# Both similar to geom_jitter()
#===============================================================
ggplot(data = diamonds) +
geom_count(mapping = aes(x = cut, y = color))

diamonds %>%
count(color, cut) %>%
ggplot(mapping = aes(x = color, y = cut)) +
geom_tile(mapping = aes(fill = n))

# Exercises 7.5.2.1 on website
#===============================================================
# How could you rescale the count dataset above to more clearly show
# the distribution of cut within color, or color within cut?
# calculate proportion
diamonds %>%
count(cut, color) %>%
group_by(color) %>%
mutate(prop = n/sum(n)) %>%
ggplot(mapping = aes(x = color, y = cut)) +
geom_tile(mapping = aes(fill = prop))

diamonds %>%
count(cut, color) %>%
group_by(cut) %>%
mutate(prop = n/sum(n)) %>%
ggplot(mapping = aes(x = color, y = cut)) +
geom_tile(mapping = aes(fill = prop))

# Use geom_tile() together with dplyr to explore how average flight
# delays vary by destination and month of year.
# What makes the plot difficult to read? How could you improve it?
flights %>%
group_by(month, dest) %>%
mutate(avg_delay = mean(arr_delay, na.rm = TRUE)) %>%
ggplot(mapping = aes(x = factor(month), y = dest)) +
geom_tile(mapping = aes(fill = avg_delay)) +
labs(x = "Month", y = "Destination", fill = "Average Delay")

# Why is it slightly better to use aes(x = color, y = cut) rather than
# aes(x = cut, y = color) in the example above?
diamonds %>%
count(color, cut) %>%
ggplot(mapping = aes(x = color, y = cut)) +
geom_tile(mapping = aes( fill = n))

diamonds %>%
count(color, cut) %>%
ggplot(mapping = aes(x = cut, y = color)) +
geom_tile(mapping = aes( fill = n))

# easier to read longer labels in y axis
# from solutions on web:
# Another justification, for switching the order is that the larger
# numbers are at the top when x = color and y = cut, and that lowers
# the cognitive burden of interpreting the plot.
#===============================================================
ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price))

ggplot(data = diamonds) +
geom_point(
mapping = aes(x = carat, y = price),
alpha = 1/100
)

ggplot(data = smaller_diamonds) +
geom_bin2d(mapping = aes(x = carat, y = price))

#install.packages("hexbin")
library(hexbin)
ggplot(data = smaller_diamonds) +
geom_hex(mapping = aes(x = carat, y = price))

ggplot(data = smaller_diamonds, mapping = aes(x = carat, y = price)) +
geom_boxplot(mapping = aes(group = cut_width(carat, 0.1)))

ggplot(data = smaller_diamonds, mapping = aes(x = carat, y = price)) +
geom_boxplot(mapping = aes(group = cut_number(carat, 20)))

# Exercises 7.5.3.1 on website
#===============================================================
# Instead of summarising the conditional distribution with a boxplot,
# you could use a frequency polygon. What do you need to consider
# when using cut_width() vs cut_number()? How does that impact a
# visualisation of the 2d distribution of carat and price?
ggplot(data = diamonds) +
geom_freqpoly(
mapping = aes(x = price, color = cut_width(carat, 0.3))
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds) +
geom_freqpoly(
mapping = aes(x = price,
y = ..density..,
color = cut_width(carat, 0.3))
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds) +
geom_freqpoly(
mapping = aes(x = price, color = cut_number(carat, 10))
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds) +
geom_freqpoly(
mapping = aes(x = price,
y = ..density..,
color = cut_number(carat, 10))
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Visualise the distribution of carat, partitioned by price.
ggplot(data = diamonds, mapping = aes(x = cut_number(price, 10), y = carat)) +
geom_boxplot() +
coord_flip() +
xlab("Price")

ggplot(diamonds, aes(x = cut_width(price, 2000, boundary = 0), y = carat)) +
geom_boxplot(varwidth = TRUE) +
coord_flip() +
xlab("Price")

# How does the price distribution of very large diamonds compare to
# small diamonds. Is it as you expect, or does it surprise you?
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
geom_boxplot(mapping = aes(group = cut_width(carat, .3)))

# More variance in large diamonds.
# Combine two of the techniques you’ve learned to visualise the combined
# distribution of cut, carat, and price.
# Instead of summarizing the conditional distribution with a boxplot,
# you could use a frequency polygon. What do you need to consider when
# using cut_width() versus cut_number()? How does that impact a
# visualization of the 2D distribution of carat and price?
ggplot(data = diamonds,
mapping = aes(x = carat, y = price)) +
geom_boxplot(mapping = aes(group = cut_width(carat, 0.3)))

ggplot(data = diamonds,
mapping = aes(x = carat, y = price)) +
geom_boxplot(mapping = aes(group = cut_number(carat, 10)))

ggplot(data = diamonds, mapping = aes(x = price)) +
geom_freqpoly(mapping = aes(color = cut_width(carat, 0.3)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds, mapping = aes(x = price)) +
geom_freqpoly(mapping = aes(color = cut_number(carat, 10)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# You need to pay attention to the width of carat you are looking at
# or the number of bins.
# Visualize the distribution of carat, partitioned by price.
ggplot(data = diamonds) +
geom_boxplot(mapping = aes(x = cut_number(price, 10), y = carat)) +
coord_flip()

ggplot(data = diamonds) +
geom_lv(mapping = aes(x = cut_number(price, 10), y = carat)) +
coord_flip()

# How does the price distribution of very large diamonds compare to
# small diamonds. Is it as you expect, or does it surprise you?
# Combine two of the techniques youâve learned to visualize the combined
# distribution of cut, carat, and price.
# Two-dimensional plots reveal outliers that are not visible in
# one-dimensional plots. For example, some points in the following plot
# have an unusual combination of x and y values, which makes the points
# outliers even though their x and y values appear normal when examined
# separately:
ggplot(data = diamonds) +
geom_point( mapping = aes(x = x, y = y)) +
coord_cartesian( xlim = c(4, 11), ylim = c(4, 11))

# Why is a scatterplot a better display than a binned plot for this case?